* Author: Joe Tatarka
* Name: 2_additional_analysis.do
* Purpose: Recreate the extra little statistics in the paper such as correlations and small data work with Homebase and the National Restaurant Association 

* Set Global File Paths [INSERT YOUR PACKAGE PATH HERE]
global root = "T:/service_industries/replication_package"

global raw_root = "${root}/datasets/raw"
global intermediate_root = "${root}/datasets/intermediate"
global built_root = "${root}/datasets/built"
global exhibits_root = "${root}/exhibits"

*******************************************************************************
**** 1. Compare SafeGraph with National Restaurant Association Data
*******************************************************************************
** Load in Data
import excel "${raw_root}/national_restaurant_association/rpi_march_13_2024.xls", clear

** Clean the data 

keep A E F G

rename A year_month
rename E sales_index 
rename F visits_index 
rename G labor_index 

drop in 1/6

destring sales_index visits_index labor_index, replace 
drop if year_month == ""

gen year_month_gs = monthly(substr(year_month,3,7),"MY")
format year_month_gs %tm

keep if year_month_gs >= monthly("jan2019", "MY") & year_month_gs != .

drop year_month 
order year_month_gs , first

* save as tempfile 
tempfile nra_data 
save `nra_data', replace

*****************************************
***** Bring in all of the sg/visits data 

* use in 1/1000 using "${data_root}/main_build.dta", clear
use "${built_root}/main_build.dta", clear

label variable year_month_gs "Year-month"
keep placekey year_month_gs nominal_spend emp visits
sort placekey year_month_gs

** xtset 
egen id = group(placekey)
xtset id year_month_gs

*** Create year lagged variables 
gen lagged_spend = L12.nominal_spend
gen lagged_emp = L12.emp
gen lagged_visits = L12.visits

***** Create dummies to check if spending/visits/emp is more or less than a year ago
gen more_spend_dummy = (nominal_spend > lagged_spend)
replace more_spend_dummy = . if (nominal_spend == . | lagged_spend == .)
gen less_spend_dummy = (nominal_spend < lagged_spend)
replace less_spend_dummy = . if (nominal_spend == . | lagged_spend == .)

gen more_emp_dummy = (emp > lagged_emp)
replace more_emp_dummy = . if (emp == . | lagged_emp == .)
gen less_emp_dummy = (emp < lagged_emp)
replace less_emp_dummy = . if (emp == . | lagged_emp == .)

gen more_visits_dummy = (visits > lagged_visits)
replace more_visits_dummy = . if (visits == . | lagged_visits == .)
gen less_visits_dummy = (visits < lagged_visits)
replace less_visits_dummy = . if (visits == . | lagged_visits == .)

*** Take average by month to create a diffusion index
collapse (mean) more_spend_dummy more_visits_dummy more_emp_dummy less_spend_dummy less_visits_dummy less_emp_dummy, by(year_month_gs)

drop if more_spend_dummy ==. | more_visits_dummy == . | more_emp_dummy == .

** Create same indexes as National Restaurant Association 
gen sg_sales_index = 10*(more_spend_dummy - less_spend_dummy) + 100
gen sg_visits_index = 10*(more_visits_dummy - less_visits_dummy) + 100
gen sg_emp_index = 10*(more_emp_dummy - less_emp_dummy) + 100

*************************************
**** Merge with NRA data 
merge 1:1 year_month_gs using `nra_data', nogen keep(1 3)

*********************************
*** Make Graphs 
label var year_month_gs "Year-Month"

*** We want to examine the monthly differences between the indexes, and show correlation 
* create month to month differences: 

tsset year_month_gs
foreach var of varlist sales_index visits_index labor_index  {
	gen diff_`var' = D.`var'
	label var diff_`var' "NRA"
}

foreach var of varlist sg_sales_index sg_visits_index sg_emp_index {
	gen diff_`var' = D.`var'
	label var diff_`var' "Safegraph"
}

****** Graph and find correlation coefficient 
corr diff_sales_index diff_sg_sales_index
local correlation = r(C)[2,1]
local fmtcorr : display %4.3f `correlation'
graph twoway line diff_sales_index diff_sg_sales_index year_month_gs, title("Same-Store Sales Index, Monthly Change") ytitle(Change in Index from Previous Month) subtitle("Correlation Coefficient: `fmtcorr'") ylabel(-15(5)15) legend(cols(2) position(6))
graph export "${exhibits_root}/additional_analysis/1_1_nra_sales_diff.pdf", replace

corr diff_visits_index diff_sg_visits_index
local correlation = r(C)[2,1]
local fmtcorr : display %4.3f `correlation'
graph twoway line diff_visits_index diff_sg_visits_index year_month_gs, title("Same-Store Customer Traffic Index, Monthly Change") ytitle(Change in Index from Previous Month) subtitle("Correlation Coefficient: `fmtcorr'") ylabel(-15(5)15) legend(cols(2) position(6))
graph export "${exhibits_root}/additional_analysis/1_2_nra_visits_diff.pdf", replace

corr diff_labor_index diff_sg_emp_index
local correlation = r(C)[2,1]
local fmtcorr : display %4.3f `correlation'
graph twoway line diff_labor_index diff_sg_emp_index year_month_gs, title("Same-Store Labor Index, Monthly Change") ytitle(Change in Index from Previous Month) subtitle("Correlation Coefficient: `fmtcorr'") ylabel(-15(5)15) legend(cols(2) position(6))
graph export "${exhibits_root}/additional_analysis/1_3_nra_emp_diff.pdf", replace

********************************************
***** 2. Homebase Analysis 
********************************************
*** Homebase is a employee scheduling program, we use their data to see the average length of restaurant employee shifts. 
*** I am going to use their data going from Jan 2018 - Dec 2022.

** Get data from Homebase Limited service eating places
use "${built_root}/homebase_build.dta", clear

*** label things
gen agg_hours_share = less_4_hours/agg_hours
label var agg_hours_share "Share of Agg Hours from Shifts Less than 4 Hours"
label var hours_worked "Avg Shift Length"
label var agg_hours "Aggregate Hours"
label var shift_less_4 "Share of Shifts less than 4 hours"
label var year_month_gs "Year-Month"

replace agg_hours = agg_hours/1000000 // put into millions

graph twoway line hours_worked year_month_gs, title("1. Avg Shift Length") subtitle("Limited-Service") ytitle("Avg Shift Length") ylabel(0(2)8) xlabel(696(12)758)
graph export "${exhibits_root}/additional_analysis/2_1_homebase.pdf", replace

graph twoway line shift_less_4 year_month_gs, title("2. Share of Shifts Less than 4 Hours") subtitle("Limited-Service") ytitle("Share of Shifts less than 4 Hours") ylabel(0(0.05)0.2) xlabel(696(12)758)
graph export "${exhibits_root}/additional_analysis/2_2_homebase.pdf", replace

graph twoway line agg_hours_share year_month_gs , title("3. Share of Agg hours from <4 hr shifts") subtitle("Limited-Service") ytitle("Share of Agg hours from <4 hr shifts") ylabel(0(0.01)0.06) xlabel(696(12)758)
graph export "${exhibits_root}/additional_analysis/2_3_homebase.pdf", replace

***********************
*** 3. Checking Correlations
***********************
**** Check Year-over-year growth rate correlation of Figure A1, industry vs limited service real sales per employee 
use "${built_root}/aggregates_build.dta", clear 
tsset year_month_gs

sort year_month_gs
foreach var of varlist prod_food_nsa prod_limited_food_cpi  {
	gen `var'_y_g = 100*(`var' -l12.`var')/l12.`var'
}
*** Industry Productivity and Limited Service Productivity, Not Seasonally Adjusted
corr prod_food_nsa_y_g prod_limited_food_cpi_y_g

****** Correlations of Monthly Growth Rates of comparisons of SafeGraph Aggregates and BLS/Census Aggregates
sort year_month_gs
foreach var of varlist spend_sg visits_sg emp_sg emp_limited spend_limited {
	gen `var'_g = 100*(`var' -l.`var')/l.`var'
}

********* Monthly growth rates correlations 
** SG Visits and Census Sales
corr visits_sg_g spend_limited_g 
* SG Sales and Census Sales
corr spend_sg_g spend_limited_g
* SG Emp and BLS Emp
corr emp_sg_g emp_limited_g

******************************
**** Entry and Exit
******************************
** Compare SG Sample Entry and Exit to Census Business Dynamic Statistics (BDS) annual rates
import delimited "${raw_root}/business_dynamic_statistics/bds2022_vcn4.csv", clear
keep if vcnaics4 == 7225
keep if year >= 2018
keep year vcnaics4 estabs estabs_entry estabs_entry_rate estabs_exit estabs_exit_rate
destring year vcnaics4 estabs estabs_entry estabs_entry_rate estabs_exit estabs_exit_rate, replace

tempfile entry_exit
save `entry_exit', replace 

****** Load in SG Data 
use placekey year_month_gs using "${built_root}/main_build.dta", clear 
gen year = 1960 + floor(year_month_gs/12)
bys placekey year: drop if _n != 1
gen num_poi = 1

fillin placekey year
sort placekey year
bys placekey: gen exit = 1 if num_poi[_n] == . & num_poi[_n-1] != .
bys placekey: gen entry = 1 if num_poi[_n] != . & num_poi[_n-1] == .

collapse (sum) num_poi entry exit, by(year) 

** create entry and exit rates rates following how BDS does it
tsset year
gen sg_exit_rate = 100*exit/((num_poi + l.num_poi)/2)
gen sg_entry_rate = 100*entry/((num_poi + l.num_poi)/2)

** merge with BDS entry and exit
merge 1:1 year using `entry_exit', nogen 

sort year
keep year sg_entry_rate sg_exit_rate estabs_entry_rate estabs_exit_rate
drop if year == 2018

* Look at average of rates for 2020-2022
sum sg_entry_rate estabs_entry_rate sg_exit_rate estabs_exit_rate if year >= 2020


**************************
**** Employees per Establishment - Census County Business Patterns 

** 2019
import delimited "${raw_root}/county_business_patterns/cbp19co.txt", clear

keep if substr(naics,1,4) == "7225"
collapse (sum) emp est, by(naics)
gen emp_per_est = emp/est
sum emp_per_est if naics == "7225//"

** 2022 
import delimited "${raw_root}/county_business_patterns/cbp22co.txt", clear

keep if substr(naics,1,4) == "7225"
collapse (sum) emp est, by(naics)
gen emp_per_est = emp/est
sum emp_per_est if naics == "7225//"
